In [1]:
import words
reload(words)
Out[1]:
In [67]:
# Load labelled header samples. CACHE toggles samples.load_samples' caching;
# False forces a fresh load every run.
CACHE = False
import samples
reload(samples)
# Training pool: keyword headers plus several country samples.
# Each sample row is a dict carrying at least 'entity', 'header' and 'sample'
# keys (see usage below and in the slices cell).
data = samples.load_samples(["Keywords", "UK", "Georgia", "Mexico", "EU"], cache=CACHE)
keywords = samples.load_samples(["Keywords"], cache=CACHE)
# Held-out sets used later for out-of-sample evaluation.
canada = samples.load_samples(["Canada"], cache=CACHE)
moldova = samples.load_samples(["Moldova"], cache=CACHE)
unops = samples.load_samples(["UNOPS"], cache=CACHE)
# Distinct entity labels present in the keyword sample.
# NOTE(review): built from a set, so the order of `entities` is arbitrary
# and can differ between runs.
entities = list( set(x['entity'] for x in keywords) )
entities
Out[67]:
In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import cross_validation
In [58]:
# Group row indices of the training pool by their originating sample name,
# e.g. slices['Canada'] -> list of indices into `data`.
slices = {}
for index, record in enumerate(data):
    bucket = record['sample']
    if bucket not in slices:
        slices[bucket] = []
    slices[bucket].append(index)
In [5]:
def organize_data(data):
    """Flatten a {entity: [headers]} mapping into a flat list of
    {'entity': ..., 'header': ...} records, preserving iteration order."""
    return [
        {'entity': entity, 'header': header}
        for entity, headers in data.items()
        for header in headers
    ]
In [11]:
def length(df):
    """Feature: character count of each header string."""
    return df['header'].str.len()
def word_count(df):
    """Feature: number of tokens words.split_words yields for each header."""
    return df['header'].apply(lambda header: sum(1 for _ in words.split_words(header)))
def header_in_entity(df):
    """Feature: True when the row's header occurs, case-insensitively, as a
    substring of that same row's entity string.

    Fix: the original used Series.isin, which tests each header for exact
    equality against ANY value in the entire entity column (set membership),
    not the row-wise containment the function name describes.
    """
    return df.apply(
        lambda row: row['header'].lower() in row['entity'].lower(), axis=1)
def entity_in_header(df):
    """Feature: True when the row's entity name occurs, case-insensitively,
    as a substring of that same row's header.

    Fix: the original used Series.isin, which tests each entity for exact
    equality against ANY value in the entire header column (set membership),
    not the row-wise containment the function name describes.
    """
    return df.apply(
        lambda row: row['entity'].lower() in row['header'].lower(), axis=1)
def entity_feature(name):
    """Build a per-entity feature function.

    The returned callable scores df['header'] with words.subsetness against
    the set of headers already labelled `name` in the global training pool
    `data`.  Its func_name attribute is set to 'entity_<name>' so
    Model.build can use it as a design-matrix column label.
    """
    entity_set = set(x['header'] for x in data if x['entity'] == name)
    def score(header):
        try:
            return words.subsetness(header, entity_set)
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; we only want to default bad headers to 0.
        except Exception:
            return 0
    # Fix: the inner function was also named `entity_feature`, shadowing the
    # factory itself; renamed for clarity (callers only see the func_name attr).
    def feature(df):
        return df['header'].apply( score )
    feature.func_name = 'entity_%s' % name  # Python 2 alias of __name__, read by Model.build
    return feature
entity_features = [entity_feature(name) for name in entities]
In [11]:
In [85]:
from sklearn.ensemble import RandomForestClassifier
class Model(object):
    """Trains and evaluates a classifier mapping headers to entity labels.

    `samples` is a list of dicts with at least a 'header' key plus the
    `outcome_key` key (default 'entity').  Feature functions take a DataFrame
    and return one design-matrix column; their func_name attribute becomes
    the column label (see build()).
    """
    def __init__(self, samples, outcome_key='entity', svm=RandomForestClassifier(n_estimators=10)):
        # NOTE(review): the default estimator is constructed once at def-time
        # and shared by every Model() created without an explicit `svm`.
        self.samples = samples
        self.svm = svm  # despite the name, any sklearn estimator; default is a random forest
        self.frame = pd.DataFrame(self.samples)
        self.outcome_key = outcome_key
        self.features_built = set()  # NOTE(review): set here but never used in this notebook
    def test(self, features, iterations=5, train_size=0.35, test_size=.25, seed=0):
        # Mean accuracy over `iterations` random shuffle splits.
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        # NOTE(review): sklearn.cross_validation was removed in 0.20; modern
        # code uses sklearn.model_selection.ShuffleSplit.
        rs = cross_validation.ShuffleSplit(len(X), n_iter=iterations, train_size=train_size, test_size=test_size, random_state=seed)
        accuracies = []
        for train_index, test_index in rs:
            # fit() refits self.svm in place; `model` is the same object.
            model = self.svm.fit(X.ix[train_index], y.ix[train_index])
            actual = y.ix[test_index].values
            predicted = model.predict(X.ix[test_index])
            accuracies.append( self.score_model(actual, predicted) )
        # '%%' renders a literal '%', so this prints e.g. "Avg Accuracy: %85.00".
        print "Avg Accuracy: %%%.2f" % np.mean(accuracies)
    def test_sample(self, slice, features):
        # Per-row report for the training rows whose indices are in `slice`.
        # NOTE(review): the model is fit on ALL rows, including `slice`, so
        # this is an in-sample (optimistic) accuracy check.
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        model = self.svm.fit(X, y)
        actual = y.ix[slice].values
        predicted = model.predict(X.ix[slice])
        accuracy = self.score_model(actual, predicted)
        for i, a, p in zip(slice, actual, predicted):
            print self.samples[i]['header'].ljust(50), a.ljust(20), p
        print "Accuracy: %%%.2f" % accuracy
    def test_data(self, data, features):
        # Fit on the full training frame, then score an external sample list
        # (true out-of-sample evaluation, unlike test_sample above).
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        model = self.svm.fit(X, y)
        df = pd.DataFrame(data)
        z = self.build(df, features)
        actual = df.entity
        predicted = model.predict(z)
        accuracy = self.score_model(actual, predicted)
        for dct, a, p in zip(data, actual, predicted):
            print dct['header'].ljust(50), a.ljust(20), p
        print "Accuracy: %%%.2f" % accuracy
    def score_model(self, actual, predicted):
        # Percentage of rows where the prediction matches the label.
        score_df = pd.DataFrame([actual, predicted], index=['actual', 'predicted']).T
        correct = sum(score_df.actual == score_df.predicted)
        incorrect = sum(score_df.actual != score_df.predicted)
        total = correct + incorrect
        accuracy = float(correct) / float(total) * 100
        return accuracy
    def predict(self, headers, features):
        # Fit on the training frame, then label a plain list of header strings.
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        model = self.svm.fit(X, y)
        # Placeholder entity so features that read df['entity'] still work.
        data = [{'header': h, 'entity': '?'} for h in headers]
        df = pd.DataFrame(data)
        z = self.build(df, features)
        # Stashed on self for ad-hoc inspection after a predict() call.
        self.df = df
        self.z = z
        predictions = model.predict(z)
        return zip(headers, predictions)
    def build(self, df, features):
        # Design matrix: one column per feature, labelled by its func_name.
        result = pd.DataFrame()
        for fn in features:
            result[fn.func_name] = fn(df)
        return result
# Train on the pooled Keywords + country samples loaded above.
model = Model(data)
In [13]:
# Baseline: surface features only.
model.test(features=[length, word_count])
In [14]:
# Surface + containment features.
model.test(features=[length, word_count, header_in_entity, entity_in_header])
In [86]:
# Full feature set: surface + containment + per-entity subsetness features.
model.test(features=[length, word_count, header_in_entity, entity_in_header] + entity_features)
In [16]:
# Per-entity subsetness features alone.
model.test(features=entity_features)
In [65]:
# Per-row report over the 'Canada' rows of the training pool (in-sample).
model.test_sample(slices['Canada'], features= [length, word_count, header_in_entity, entity_in_header] + entity_features)
In [87]:
# Out-of-sample: the separately loaded `canada` sample is not part of `data`.
model.test_data(canada, features= [length, word_count, header_in_entity, entity_in_header] + entity_features)
In [37]:
# Inspect prediction inputs joined with their built feature columns.
# NOTE(review): model.df / model.z are only set by Model.predict, which runs
# in cell In [19] below — on a fresh Restart-&-Run-All this cell fails.
model.df.join(model.z)
Out[37]:
In [19]:
# Re-train on keyword samples only, then label unknown headers.
model = Model(keywords)
# NOTE(review): `headers` is not defined anywhere in this notebook — it must
# come from a deleted cell or leftover kernel state, so this cell fails on a
# fresh Restart-&-Run-All. Define `headers` explicitly before this cell.
results = model.predict(headers, features = [length, word_count, header_in_entity, entity_in_header] + entity_features)
for header, result in results:
    print header.ljust(50), result
In [ ]:
# Peek at the raw header strings in the held-out UNOPS sample.
[obj['header'] for obj in unops]